{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对活动进行聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1918771225</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1502284248</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3044012</td>\n",
       "      <td>2529072432</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3044012</td>\n",
       "      <td>3072478280</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3044012</td>\n",
       "      <td>1390707377</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-10-02 15:53:05.754000+00:00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp  interested  \\\n",
       "0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1   \n",
       "3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0   \n",
       "\n",
       "   not_interested  \n",
       "0               0  \n",
       "1               0  \n",
       "2               0  \n",
       "3               0  \n",
       "4               0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the training interactions: each row pairs a user with an event and carries\n",
    "# the invited flag, a timestamp, and the interested / not_interested columns.\n",
    "train = pd.read_csv(\"train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 15398 entries, 0 to 15397\n",
      "Data columns (total 6 columns):\n",
      "user              15398 non-null int64\n",
      "event             15398 non-null int64\n",
      "invited           15398 non-null int64\n",
      "timestamp         15398 non-null object\n",
      "interested        15398 non-null int64\n",
      "not_interested    15398 non-null int64\n",
      "dtypes: int64(5), object(1)\n",
      "memory usage: 721.9+ KB\n"
     ]
    }
   ],
   "source": [
    "# Show dtypes and non-null counts per column to check for missing values.\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 没有缺失值，共 15398 条数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>event</th>\n",
       "      <th>invited</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1776192</td>\n",
       "      <td>2877501688</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1776192</td>\n",
       "      <td>3025444328</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1776192</td>\n",
       "      <td>4078218285</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1776192</td>\n",
       "      <td>1024025121</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:01.230000+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1776192</td>\n",
       "      <td>2972428928</td>\n",
       "      <td>0</td>\n",
       "      <td>2012-11-30 11:39:21.985000+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user       event  invited                         timestamp\n",
       "0  1776192  2877501688        0  2012-11-30 11:39:01.230000+00:00\n",
       "1  1776192  3025444328        0  2012-11-30 11:39:01.230000+00:00\n",
       "2  1776192  4078218285        0  2012-11-30 11:39:01.230000+00:00\n",
       "3  1776192  1024025121        0  2012-11-30 11:39:01.230000+00:00\n",
       "4  1776192  2972428928        0  2012-11-30 11:39:21.985000+00:00"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the test interactions: user, event, invited and timestamp columns only.\n",
    "test = pd.read_csv(\"test.csv\")\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 10237 entries, 0 to 10236\n",
      "Data columns (total 4 columns):\n",
      "user         10237 non-null int64\n",
      "event        10237 non-null int64\n",
      "invited      10237 non-null int64\n",
      "timestamp    10237 non-null object\n",
      "dtypes: int64(3), object(1)\n",
      "memory usage: 320.0+ KB\n"
     ]
    }
   ],
   "source": [
    "# Show dtypes and non-null counts per column to check for missing values.\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用户数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>locale</th>\n",
       "      <th>birthyear</th>\n",
       "      <th>gender</th>\n",
       "      <th>joinedAt</th>\n",
       "      <th>location</th>\n",
       "      <th>timezone</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3197468391</td>\n",
       "      <td>id_ID</td>\n",
       "      <td>1993</td>\n",
       "      <td>male</td>\n",
       "      <td>2012-10-02T06:40:55.524Z</td>\n",
       "      <td>Medan  Indonesia</td>\n",
       "      <td>480.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3537982273</td>\n",
       "      <td>id_ID</td>\n",
       "      <td>1992</td>\n",
       "      <td>male</td>\n",
       "      <td>2012-09-29T18:03:12.111Z</td>\n",
       "      <td>Medan  Indonesia</td>\n",
       "      <td>420.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>823183725</td>\n",
       "      <td>en_US</td>\n",
       "      <td>1975</td>\n",
       "      <td>male</td>\n",
       "      <td>2012-10-06T03:14:07.149Z</td>\n",
       "      <td>Stratford  Ontario</td>\n",
       "      <td>-240.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1872223848</td>\n",
       "      <td>en_US</td>\n",
       "      <td>1991</td>\n",
       "      <td>female</td>\n",
       "      <td>2012-11-04T08:59:43.783Z</td>\n",
       "      <td>Tehran  Iran</td>\n",
       "      <td>210.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3429017717</td>\n",
       "      <td>id_ID</td>\n",
       "      <td>1995</td>\n",
       "      <td>female</td>\n",
       "      <td>2012-09-10T16:06:53.132Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>420.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id locale birthyear  gender                  joinedAt  \\\n",
       "0  3197468391  id_ID      1993    male  2012-10-02T06:40:55.524Z   \n",
       "1  3537982273  id_ID      1992    male  2012-09-29T18:03:12.111Z   \n",
       "2   823183725  en_US      1975    male  2012-10-06T03:14:07.149Z   \n",
       "3  1872223848  en_US      1991  female  2012-11-04T08:59:43.783Z   \n",
       "4  3429017717  id_ID      1995  female  2012-09-10T16:06:53.132Z   \n",
       "\n",
       "             location  timezone  \n",
       "0    Medan  Indonesia     480.0  \n",
       "1    Medan  Indonesia     420.0  \n",
       "2  Stratford  Ontario    -240.0  \n",
       "3        Tehran  Iran     210.0  \n",
       "4                 NaN     420.0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the user profiles: user_id, locale, birthyear, gender, joinedAt,\n",
    "# location and timezone.\n",
    "users = pd.read_csv(\"users.csv\")\n",
    "users.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 38209 entries, 0 to 38208\n",
      "Data columns (total 7 columns):\n",
      "user_id      38209 non-null int64\n",
      "locale       38209 non-null object\n",
      "birthyear    38209 non-null object\n",
      "gender       38100 non-null object\n",
      "joinedAt     38152 non-null object\n",
      "location     32745 non-null object\n",
      "timezone     37773 non-null float64\n",
      "dtypes: float64(1), int64(1), object(5)\n",
      "memory usage: 2.0+ MB\n"
     ]
    }
   ],
   "source": [
    "# Show dtypes and non-null counts per column to check for missing values.\n",
    "users.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "共约 3.8 万条记录。\n",
    "\n",
    "gender、joinedAt、location、timezone 这几个特征有缺失值，所以需要做缺失值处理。\n",
    "\n",
    "用户数比训练集和测试集中出现的用户多。为节省空间和时间，竞赛中可以只取出训练集和测试集中出现的用户（猜测 event 也是一样：events.csv 以 gz 压缩格式给出，记录数目应该更多）。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 活动数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.dataframe as dd\n",
    "\n",
    "# events.csv is too large to load with a single pd.read_csv call, and the earlier\n",
    "# attempts with a chunked reader (iterator=True / get_chunk) and with line-by-line\n",
    "# reading did not work either. dask builds the frame lazily, partition by\n",
    "# partition, so data is only read when a result such as .head() is requested.\n",
    "events = dd.read_csv('events.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 试验了很多方法都没有成功，最后用 dask 成功了\n",
    "参考：<https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the dask frame.\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'dask.dataframe.core.DataFrame'>\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: object(5), float64(2), int64(103)"
     ]
    }
   ],
   "source": [
    "# List the column dtypes of the dask frame.\n",
    "events.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## dask 是惰性计算的，info() 看不出一共有多少行（需要时可用 len(events) 触发统计）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 活动参加者数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event</th>\n",
       "      <th>yes</th>\n",
       "      <th>maybe</th>\n",
       "      <th>invited</th>\n",
       "      <th>no</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1159822043</td>\n",
       "      <td>1975964455 252302513 4226086795 3805886383 142...</td>\n",
       "      <td>2733420590 517546982 1350834692 532087573 5831...</td>\n",
       "      <td>1723091036 3795873583 4109144917 3560622906 31...</td>\n",
       "      <td>3575574655 1077296663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>686467261</td>\n",
       "      <td>2394228942 2686116898 1056558062 3792942231 41...</td>\n",
       "      <td>1498184352 645689144 3770076778 331335845 4239...</td>\n",
       "      <td>1788073374 733302094 1830571649 676508092 7081...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1186208412</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3320380166 3810793697</td>\n",
       "      <td>1379121209 440668682</td>\n",
       "      <td>1728988561 2950720854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2621578336</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>855842686</td>\n",
       "      <td>2406118796 3550897984 294255260 1125817077 109...</td>\n",
       "      <td>2671721559 1761448345 2356975806 2666669465 10...</td>\n",
       "      <td>1518670705 880919237 2326414227 2673818347 332...</td>\n",
       "      <td>3500235232</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        event                                                yes  \\\n",
       "0  1159822043  1975964455 252302513 4226086795 3805886383 142...   \n",
       "1   686467261  2394228942 2686116898 1056558062 3792942231 41...   \n",
       "2  1186208412                                                NaN   \n",
       "3  2621578336                                                NaN   \n",
       "4   855842686  2406118796 3550897984 294255260 1125817077 109...   \n",
       "\n",
       "                                               maybe  \\\n",
       "0  2733420590 517546982 1350834692 532087573 5831...   \n",
       "1  1498184352 645689144 3770076778 331335845 4239...   \n",
       "2                              3320380166 3810793697   \n",
       "3                                                NaN   \n",
       "4  2671721559 1761448345 2356975806 2666669465 10...   \n",
       "\n",
       "                                             invited                     no  \n",
       "0  1723091036 3795873583 4109144917 3560622906 31...  3575574655 1077296663  \n",
       "1  1788073374 733302094 1830571649 676508092 7081...                    NaN  \n",
       "2                               1379121209 440668682  1728988561 2950720854  \n",
       "3                                                NaN                    NaN  \n",
       "4  1518670705 880919237 2326414227 2673818347 332...             3500235232  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the per-event response lists: the yes / maybe / invited / no columns each\n",
    "# hold a space-separated string of user ids (NaN when there are none).\n",
    "event_attendees = pd.read_csv(\"event_attendees.csv\")\n",
    "event_attendees.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 24144 entries, 0 to 24143\n",
      "Data columns (total 5 columns):\n",
      "event      24144 non-null int64\n",
      "yes        22160 non-null object\n",
      "maybe      20977 non-null object\n",
      "invited    22322 non-null object\n",
      "no         17485 non-null object\n",
      "dtypes: int64(1), object(4)\n",
      "memory usage: 943.2+ KB\n"
     ]
    }
   ],
   "source": [
    "# Show dtypes and non-null counts per column to check for missing values.\n",
    "event_attendees.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "缺失数据很多（缺失值表示没有用户）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用户好友数据\n",
    "user_friends.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>friends</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3197468391</td>\n",
       "      <td>1346449342 3873244116 4226080662 1222907620 54...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3537982273</td>\n",
       "      <td>1491560444 395798035 2036380346 899375619 3534...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>823183725</td>\n",
       "      <td>1484954627 1950387873 1652977611 4185960823 42...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1872223848</td>\n",
       "      <td>83361640 723814682 557944478 1724049724 253059...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3429017717</td>\n",
       "      <td>4253303705 2130310957 1838389374 3928735761 71...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         user                                            friends\n",
       "0  3197468391  1346449342 3873244116 4226080662 1222907620 54...\n",
       "1  3537982273  1491560444 395798035 2036380346 899375619 3534...\n",
       "2   823183725  1484954627 1950387873 1652977611 4185960823 42...\n",
       "3  1872223848  83361640 723814682 557944478 1724049724 253059...\n",
       "4  3429017717  4253303705 2130310957 1838389374 3928735761 71..."
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load each user's friend list; the friends column is a space-separated\n",
    "# string of ids.\n",
    "user_friends = pd.read_csv(\"user_friends.csv\")\n",
    "user_friends.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 38202 entries, 0 to 38201\n",
      "Data columns (total 2 columns):\n",
      "user       38202 non-null int64\n",
      "friends    38063 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 597.0+ KB\n"
     ]
    }
   ],
   "source": [
    "# Show dtypes and non-null counts per column to check for missing values.\n",
    "user_friends.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 存在缺失值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用户和活动关联关系处理\n",
    "整个数据集中活动数目（events.csv）太多，所以下面只处理在训练集和测试集中出现过的活动和用户集合，并对它们重新编制索引。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存数据\n",
    "import pickle as cPickle # python 3.x应该这样导入\n",
    "\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "# Collect the distinct user ids and event ids that occur in train.csv / test.csv.\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "# Inverted indexes, filled from train.csv further down and keyed by the\n",
    "# re-encoded integer indexes (not the raw ids):\n",
    "#   eventsForUser[user_index]  -> set of event indexes recorded for that user\n",
    "#   usersForEvent[event_index] -> set of user indexes recorded for that event\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "\n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    # Text mode (Python 3). The context manager closes the handle even if a\n",
    "    # malformed row raises part-way through the file.\n",
    "    with open(filename, 'r') as f:\n",
    "        f.readline()  # skip the header row (column names)\n",
    "        for line in f:\n",
    "            cols = line.strip().split(\",\")\n",
    "            uniqueUsers.add(cols[0])    # column 0: user id\n",
    "            uniqueEvents.add(cols[1])   # column 1: event id\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n",
    "\n",
    "# Sparse user x event matrix holding the `interested` flag from train.csv.\n",
    "# It can serve as input for later LFM / SVD++ style models.\n",
    "userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))\n",
    "\n",
    "# Re-encode the raw ids as dense 0..n-1 indexes.\n",
    "# The ids are sorted first: Python randomises str hashes per process, so\n",
    "# enumerating the sets directly could hand out different indexes on every\n",
    "# kernel restart and silently invalidate matrices saved by an earlier run.\n",
    "userIndex = {u: i for i, u in enumerate(sorted(uniqueUsers))}\n",
    "eventIndex = {e: i for i, e in enumerate(sorted(uniqueEvents))}\n",
    "\n",
    "n_records = 0  # never incremented in this cell; kept in case a later cell reads it\n",
    "with open(\"train.csv\", 'r') as ftrain:\n",
    "    ftrain.readline()  # skip the header row\n",
    "    for line in ftrain:\n",
    "        cols = line.strip().split(\",\")\n",
    "        i = userIndex[cols[0]]   # user index\n",
    "        j = eventIndex[cols[1]]  # event index\n",
    "\n",
    "        eventsForUser[i].add(j)    # this user has a record for this event\n",
    "        usersForEvent[j].add(i)    # this event has a record from this user\n",
    "\n",
    "        # Column 4 is `interested` (0/1). A stored 0 is the same as an absent\n",
    "        # entry in a sparse matrix, so interested=0 and \"no record\" cannot be\n",
    "        # told apart from this matrix alone.\n",
    "        score = int(cols[4])\n",
    "        userEventScores[i, j] = score\n",
    "\n",
    "# Events per user; used later to propagate friends' events to a user.\n",
    "with open(\"PE_eventsForUser.pkl\", 'wb') as fout:\n",
    "    cPickle.dump(eventsForUser, fout)\n",
    "# Users per event.\n",
    "with open(\"PE_usersForEvent.pkl\", 'wb') as fout:\n",
    "    cPickle.dump(usersForEvent, fout)\n",
    "\n",
    "# User-event relation matrix R, saved in Matrix Market format for later use.\n",
    "sio.mmwrite(\"PE_userEventScores\", userEventScores)\n",
    "\n",
    "# Raw id -> index lookup tables.\n",
    "with open(\"PE_userIndex.pkl\", 'wb') as fout:\n",
    "    cPickle.dump(userIndex, fout)\n",
    "with open(\"PE_eventIndex.pkl\", 'wb') as fout:\n",
    "    cPickle.dump(eventIndex, fout)\n",
    "\n",
    "# To avoid needless similarity computations, enumerate only related pairs:\n",
    "#   related users  = pairs of users with a record on the same event\n",
    "#   related events = pairs of events with a record from the same user\n",
    "# NOTE(review): the `> 2` guards skip groups of exactly two, although such a\n",
    "# group forms one valid pair; `>= 2` may have been intended. Left unchanged\n",
    "# because it alters the saved pair sets - confirm before changing.\n",
    "uniqueUserPairs = set()\n",
    "uniqueEventPairs = set()\n",
    "for event in uniqueEvents:\n",
    "    i = eventIndex[event]\n",
    "    users = usersForEvent[i]\n",
    "    if len(users) > 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(users, 2))\n",
    "\n",
    "for user in uniqueUsers:\n",
    "    u = userIndex[user]\n",
    "    events = eventsForUser[u]\n",
    "    if len(events) > 2:\n",
    "        uniqueEventPairs.update(itertools.combinations(events, 2))\n",
    "\n",
    "# Save the related user pairs and related event pairs (as index tuples).\n",
    "with open(\"FE_uniqueUserPairs.pkl\", 'wb') as fout:\n",
    "    cPickle.dump(uniqueUserPairs, fout)\n",
    "with open(\"PE_uniqueEventPairs.pkl\", 'wb') as fout:\n",
    "    cPickle.dump(uniqueEventPairs, fout)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入kmeans，minibatchkmeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import MiniBatchKMeans, KMeans\n",
    "#event的特征需要编码\n",
    "from utils import FeatureEng\n",
    "from sklearn.preprocessing import normalize\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 统计活动数目"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of records :3137972\n"
     ]
    }
   ],
   "source": [
    "# Count the data rows in events.csv.\n",
    "# Columns: event_id, user_id, start_time, city, state, zip, country, lat, lng,\n",
    "# followed by 101 word-count columns.\n",
    "with open(\"events.csv\", 'r') as fin:\n",
    "    fin.readline()  # skip the header row (column names)\n",
    "    # Only the number of rows is needed, so rows are counted without being\n",
    "    # split into columns.\n",
    "    lines = sum(1 for _row in fin)\n",
    "\n",
    "print(\"number of records :%d\" % lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "活动数目太多（300w+），而训练集和测试集中涉及的活动没有这么多。因此先处理 train 和 test，得到竞赛需要用到的活动和用户，然后为在训练集和测试集中出现过的活动和用户建立新的 ID 索引。需要先运行 user_event.ipynb（或本 notebook 前面生成索引的单元），得到活动列表文件 PE_eventIndex.pkl。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "读取之前算好的测试集和训练集中出现过的活动"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of events in train & test :13418\n"
     ]
    }
   ],
   "source": [
    "# Load the raw-event-id -> index table saved earlier. Only unpickle files\n",
    "# you produced yourself: pickle can execute arbitrary code on load.\n",
    "with open(\"PE_eventIndex.pkl\", 'rb') as fidx:\n",
    "    eventIndex = cPickle.load(fidx)\n",
    "n_events = len(eventIndex)\n",
    "\n",
    "print(\"number of events in train & test :%d\" % n_events)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理events.csv --> 特征编码、活动之间的相似度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "FE = FeatureEng()\n",
    "\n",
    "# Seven encoded properties per event:\n",
    "# start_time, city, state, zip, country, lat, lng\n",
    "eventPropMatrix = ss.dok_matrix((n_events, 7))\n",
    "\n",
    "# 101 word-count features per event\n",
    "eventContMatrix = ss.dok_matrix((n_events, 101))\n",
    "\n",
    "# Columns: event_id, user_id, start_time, city, state, zip, country, lat, lng,\n",
    "# followed by 101 word-count columns.\n",
    "with open(\"events.csv\", 'r') as fin:\n",
    "    fin.readline()  # skip header\n",
    "\n",
    "    # Stream the file row by row: events.csv has about 3.1 million rows, so\n",
    "    # readlines() would hold the whole file in memory for no benefit.\n",
    "    for line in fin:\n",
    "        cols = line.strip().split(\",\")\n",
    "        eventId = str(cols[0])\n",
    "\n",
    "        if eventId in eventIndex:  # event occurs in the train or test set\n",
    "            i = eventIndex[eventId]\n",
    "\n",
    "            # Simple property encoding; start time and location carry a lot\n",
    "            # of signal and could be encoded more carefully.\n",
    "            eventPropMatrix[i, 0] = FE.getJoinedYearMonth(cols[2].encode()) # start_time\n",
    "            eventPropMatrix[i, 1] = FE.getFeatureHash(cols[3].encode()) # city\n",
    "            eventPropMatrix[i, 2] = FE.getFeatureHash(cols[4].encode()) # state\n",
    "            eventPropMatrix[i, 3] = FE.getFeatureHash(cols[5].encode()) # zip\n",
    "            eventPropMatrix[i, 4] = FE.getFeatureHash(cols[6].encode()) # country\n",
    "            eventPropMatrix[i, 5] = FE.getFloatValue(cols[7].encode()) # lat\n",
    "            eventPropMatrix[i, 6] = FE.getFloatValue(cols[8].encode()) # lon\n",
    "\n",
    "            # Word counts: CSV columns 9..109 map to matrix columns 0..100.\n",
    "            # Converted explicitly instead of relying on scipy to coerce str.\n",
    "            for j in range(9, 110):\n",
    "                eventContMatrix[i, j-9] = float(cols[j])\n",
    "\n",
    "# L2 normalisation (K-means is based on L2 distance). axis=0 scales each\n",
    "# feature column, not each event row, to unit norm.\n",
    "eventPropMatrix = normalize(eventPropMatrix,\n",
    "    norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"EV_eventPropMatrix\", eventPropMatrix)\n",
    "\n",
    "# Word counts; these can be clustered later to derive an event genre.\n",
    "eventContMatrix = normalize(eventContMatrix,\n",
    "    norm=\"l2\", axis=0, copy=False)\n",
    "sio.mmwrite(\"EV_eventContMatrix\", eventContMatrix)\n",
    "\n",
    "\n",
    "# Pairwise event scores based on the two feature matrices.\n",
    "# NOTE: scipy's correlation() and cosine() return distances (1 - coefficient),\n",
    "# so smaller values mean more similar events despite the *Sim names.\n",
    "eventPropSim = ss.dok_matrix((n_events, n_events))\n",
    "eventContSim = ss.dok_matrix((n_events, n_events))\n",
    "\n",
    "# Related event pairs from the train/test sets; the tuples already hold\n",
    "# event indexes, not raw event ids.\n",
    "with open(\"PE_uniqueEventPairs.pkl\", 'rb') as fpairs:\n",
    "    uniqueEventPairs = cPickle.load(fpairs)\n",
    "\n",
    "for i, j in uniqueEventPairs:\n",
    "    # Property features: Pearson correlation distance.\n",
    "    # Rows are flattened to 1-D because the scipy distance functions expect\n",
    "    # 1-D vectors.\n",
    "    if (i, j) not in eventPropSim:\n",
    "        epsim = ssd.correlation(eventPropMatrix.getrow(i).toarray().ravel(),\n",
    "            eventPropMatrix.getrow(j).toarray().ravel())\n",
    "\n",
    "        eventPropSim[i, j] = epsim\n",
    "        eventPropSim[j, i] = epsim\n",
    "\n",
    "    # Word-count features: cosine distance (histogram intersection or Jaccard\n",
    "    # would be alternatives).\n",
    "    if (i, j) not in eventContSim:\n",
    "        ecsim = ssd.cosine(eventContMatrix.getrow(i).toarray().ravel(),\n",
    "            eventContMatrix.getrow(j).toarray().ravel())\n",
    "\n",
    "        # Store the cosine value computed above. The earlier code wrote\n",
    "        # `epsim` here, which made eventContSim a copy of eventPropSim.\n",
    "        eventContSim[i, j] = ecsim\n",
    "        eventContSim[j, i] = ecsim\n",
    "\n",
    "sio.mmwrite(\"EV_eventPropSim\", eventPropSim)\n",
    "sio.mmwrite(\"EV_eventContSim\", eventContSim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[0., 0., 0., ..., 0., 0., 0.]])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show row 0 of the event-property similarity matrix as a dense row.\n",
    "eventPropSim.getrow(0).todense()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 尝试使用minibatchkmeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the word-count matrix previously saved in Matrix Market format.\n",
    "import scipy.io as sio\n",
    "from sklearn import metrics\n",
    "eventContMatrix = sio.mmread(\"EV_eventContMatrix\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit one model for a single hyper-parameter value (number of clusters K)\n",
    "# and score the resulting clustering.\n",
    "def K_cluster_analysis(K, df, random_state=0):\n",
    "    \"\"\"Cluster `df` into K groups with MiniBatchKMeans and score the result.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    K : int\n",
    "        Number of clusters to fit.\n",
    "    df : feature matrix\n",
    "        Passed unchanged to MiniBatchKMeans.fit/predict and to\n",
    "        sklearn.metrics.silhouette_score (callers here pass a scipy sparse\n",
    "        matrix).\n",
    "    random_state : int or None, default 0\n",
    "        Seed forwarded to MiniBatchKMeans so repeated runs give the same\n",
    "        score. Pass None for a fresh random initialisation on every call.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Mean silhouette coefficient of the clustering, in [-1, 1]; higher is\n",
    "        better. This is not the Calinski-Harabasz index, even though callers\n",
    "        collect the values in a list named CH_scores.\n",
    "    \"\"\"\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "\n",
    "    # Fit on the full data set, then assign every sample to its cluster.\n",
    "    km = MiniBatchKMeans(n_clusters=K, random_state=random_state)\n",
    "    km.fit(df)\n",
    "    cluster_result = km.predict(df)\n",
    "\n",
    "    # The silhouette coefficient is used to compare different K values.\n",
    "    score = metrics.silhouette_score(df, cluster_result)\n",
    "    print(\"silhouette_score: {}\".format(score))\n",
    "\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.13347654847573215\n",
      "K-means begin with clusters: 20\n",
      "CH_score: -0.09102595032169149\n",
      "K-means begin with clusters: 30\n",
      "CH_score: -0.019213768525758488\n",
      "K-means begin with clusters: 40\n",
      "CH_score: -0.1397889987310736\n",
      "K-means begin with clusters: 50\n",
      "CH_score: -0.015116655158589006\n",
      "K-means begin with clusters: 60\n",
      "CH_score: -0.11369471777681084\n",
      "K-means begin with clusters: 70\n",
      "CH_score: -0.0795374308413839\n",
      "K-means begin with clusters: 80\n",
      "CH_score: -0.14273977487076292\n",
      "K-means begin with clusters: 90\n",
      "CH_score: -0.19732115643079412\n",
      "K-means begin with clusters: 100\n",
      "CH_score: -0.10384022332356085\n"
     ]
    }
   ],
   "source": [
    "# Coarse search over the number of clusters K: 10, 20, ..., 100.\n",
    "# Each entry of CH_scores is the value returned by K_cluster_analysis\n",
    "# (a silhouette score, despite the CH_ prefix).\n",
    "Ks = list(range(10, 101, 10))\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, eventContMatrix)\n",
    "    CH_scores += [ch]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.13347654847573215, -0.09102595032169149, -0.019213768525758488, -0.1397889987310736, -0.015116655158589006, -0.11369471777681084, -0.0795374308413839, -0.14273977487076292, -0.19732115643079412, -0.10384022332356085]\n"
     ]
    }
   ],
   "source": [
    "print(CH_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7f604a7f8588>]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD8CAYAAABzTgP2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XeYlOX1//H3AaSIBVEUIxaMGMQuK9jdtcQSI4nRRGMiiRqMJfaCIX6NmsSCKLGgIvZOjEZiEhti1Jgoi2JFAmrUFQMoiIKGtuf3xxl+7q677OzOM/NM+byua67ZmXnKYZjdM8997mLujoiIyAod0g5ARESKixKDiIg0osQgIiKNKDGIiEgjSgwiItKIEoOIiDSSSGIws/3NbLqZzTSz4c28voeZvWhmy8zs0CavLTezqZnbhCTiERGR9rNcxzGYWUfg38C+QB0wGTjC3d9osM0mwBrAmcAEd7+/wWsL3X21nIIQEZHEdErgGIOAme7+NoCZ3QsMAf5/YnD3/2Req0/gfCIikkdJJIYNgPcbPK4DBrdh/65mVgssAy5x9z81t5GZDQOGAXTv3n1g//792xmuiEhlmjJlykfu3qu17ZJIDNbMc21pn9rI3WeZ2abAk2b2qru/9ZUDuo8FxgJUVVV5bW1t+6IVEalQZvZuNtslUXyuAzZs8LgPMCvbnd19Vub+beApYPsEYhIRkXZKIjFMBvqZWV8z6wwcDmTVu8jM1jKzLpmf1wF2pUFtQkRECi/nxODuy4CTgEeBacB4d3/dzC40s4MBzGxHM6sDDgNuMLPXM7tvAdSa2cvAJKLGoMQgIpKinLurpkE1BhGRtjOzKe5e1dp2GvksIiKNKDGIiEgjSgwiItJIRSWGe+6B669POwoRkeJWUYnhgQfg4ouhBOvtIiIFU1GJoboa3nsP3nkn7UhERIpXRSWGmpq4f+qpVMMQESlqFZUYttgC1l0XJk1KOxIRkeJVUYnBLJqTnnpKdQYRkZZUVGKAaE6qq4O3vjJ/q4iIQAUmhurquFedQUSkeRWXGL7xDejdW3UGEZGWVFxiUJ1BRGTlKi4xQNQZZs2CGTPSjkREpPhUZGJQnUFEpGUVmRj69YOvfU11BhGR5lRkYlhRZ5g0SXUGEZGmKjIxQNQZZs+G6dPTjkREpLhUbGJYUWdQc5KISGMVmxi+/nXo00cFaBGRpio2MWg8g4hI8xJJDGa2v5lNN7OZZja8mdf3MLMXzWyZmR3a5LWhZjYjcxuaRDzZqqmBOXNg2rRCnlVEpLjlnBjMrCNwLXAAMAA4wswGNNnsPeAnwN1N9u0JnA8MBgYB55vZWrnGlC3VGUREviqJK4ZBwEx3f9vdlwD3AkMabuDu/3H3V4D6JvvuBzzu7vPcfT7wOLB/AjFlpW9f2Ggj1RlERBpKIjFsALzf4HFd5rlE9zWzYWZWa2a1c+fObVegXz1mNCc99RTUN01ZIiIVKonEYM08l205N+t93X2su1e5e1WvXr2yDq411dXw0UfwxhuJHVJEpKQlkRjqgA0bPO4DzCrAvolQnUFEpLEkEsNkoJ+Z9TWzzsDhwIQs930U+KaZrZUpOn8z81zBbLJJ3FRnEBEJOScGd18GnET8QZ8GjHf3183sQjM7GMDMdjSzOuAw4AYzez2z7zzgIiK5TAYuzDxXUKoziIh8ybwER3dVVVV5bW1tYse7/XYYOhRefhm22Saxw4qIFBUzm+LuVa1tV7EjnxtSnUFE5EtKDMRYhk03VWIQEQElhv+vpgaeflp1BhERJYaM6mqYPz/qDCIilUyJIUPrQIuIBCWGjD59YLPNVGcQEVFiaGBFnWH58rQjERFJjxJDA9XVsGABTJ2adiQiIulRYmhAdQYRESWGRr72Ndh8c9UZRKSyKTE0UVMDzzwDy5alHYmISDqUGJqoqYFPP4WXXko7EhGRdCgxNLHnnnGvOoOI
VColhiZ694YttlCdQUQqlxJDM6qrVWcQkcqlxNCMmhpYuBCmTEk7EhGRwlNiaIbqDCJSyZQYmrHuurDllqoziEhlUmJoQXU1PPssLF2adiQiIoWlxNCCmhpYtAgSXFpaRKQkKDG0YEWdQc1JIlJpEkkMZra/mU03s5lmNryZ17uY2X2Z1583s00yz29iZl+Y2dTM7fok4knCOuvA1lurAC0ilSfnxGBmHYFrgQOAAcARZjagyWbHAPPdfTPgSuDSBq+95e7bZW4/zzWeJFVXwz/+AUuWpB2JiEjhJHHFMAiY6e5vu/sS4F5gSJNthgC3ZX6+H9jbzCyBc+dVTQ18/jlMnpx2JCIihZNEYtgAeL/B47rMc81u4+7LgAXA2pnX+prZS2b2dzPbvaWTmNkwM6s1s9q5c+cmEHbr9tgDzFRnEJHKkkRiaO6bv2e5zYfARu6+PXA6cLeZrdHcSdx9rLtXuXtVr169cgo4W2uvDdtsozqDiFSWJBJDHbBhg8d9gFktbWNmnYA1gXnuvtjdPwZw9ynAW8DmCcSUmBV1hsWL045ERKQwkkgMk4F+ZtbXzDoDhwMTmmwzARia+flQ4El3dzPrlSleY2abAv2AtxOIKTE1NfC//8ELL6QdiYhIYeScGDI1g5OAR4FpwHh3f93MLjSzgzOb3QSsbWYziSajFV1a9wBeMbOXiaL0z919Xq4xJUl1BhGpNObetBxQ/Kqqqry2gEOSd9gBevSAJ58s2ClFRBJnZlPcvaq17TTyOQs1NfDcc9GkJCJS7pQYslBdHcXn559POxIRkfxTYsjC7rtDhw6qM4hIZVBiyEKPHrD99koMIlIZlBiyVFMD//oXfPFF2pGIiOSXEkOWqqtjMr1//jPtSERE8kuJIUsr6gyaHkNEyp0SQ5bWWAMGDlSdQUTKnxJDG9TURJfVzz9POxIRkfxRYmiD6mpYujQGu4mIlCslhjbYbTfo2FF1BhEpb0oMbbD66lBVpTqDiJQ3JYY2qqmJKbgXLUo7EhGR/FBiaKPqali2LBbvEREpR0oMbbTrrtCpk+oMIlK+lBjaaLXVYNAg1RlEpHwpMbRDdTVMngwLF6YdiYhI8pQY2qGmBpYvh2efTTsSEZHkKTG0wy67wCqrqDlJRMqTEkM7rLoqDB6sArSIlCclhnaqroYpU+DTT9OOREQkWYkkBjPb38ymm9lMMxvezOtdzOy+zOvPm9kmDV47N/P8dDPbL4l4CkF1BhEpVzknBjPrCFwLHAAMAI4wswFNNjsGmO/umwFXApdm9h0AHA5sCewPjMkcr+jtvDN07qw6g4iUnySuGAYBM939bXdfAtwLDGmyzRDgtszP9wN7m5llnr/X3Re7+zvAzMzxil63brDTTqoziEj5SSIxbAC83+BxXea5Zrdx92XAAmDtLPcFwMyGmVmtmdXOnTs3gbBzV10NL74ICxakHYmISHKSSAzWzHOe5TbZ7BtPuo919yp3r+rVq1cbQ8yPmhqor4dnnkk7EhGR5CSRGOqADRs87gPMamkbM+sErAnMy3LforXTTtCli+oMIlJekkgMk4F+ZtbXzDoTxeQJTbaZAAzN/Hwo8KS7e+b5wzO9lvoC/YAXEoipILp2jSK06gwiUk5yTgyZmsFJwKPANGC8u79uZhea2cGZzW4C1jazmcDpwPDMvq8D44E3gEeAE919ea4xFVJ1Nbz0Esyfn3YkIiLJsPjiXlqqqqq8trY27TAAePpp2HNPeOghOPjg1rcXEUmLmU1x96rWttPI5xwNHhxNSqoziEi5UGLIUZcuMame6gwiUi6UGBJQUwMvvwzz5qUdibTX1KlwwgmwZEnakYikT4khAdXV4A5//3vakUh7DR8O110H11yTdiQi6VNiSMCgQTFFRqk1J737bgzQq3SvvAKPPhrLtl54IRTJwHqR1CgxJKBzZ9h119IqQD/8MPTtCxdckHYk6Rs1Crp3h8ceg0WL4Lzz0o5IJF1KDAmpqYFX
X4WPPko7ktbNnAk/+lE0f111VWWvKVFXB3ffDcccE4MVTzwRbrwxriJEKpUSQ0Kqq+O+2OsMixbBd78LnTrB+PHwyScwdmzaUaXn6qujOe3UU+Px+edDjx7xuASH+IgkQokhITvuGEt+FnOdwR2OPRbeeAPuuQcOOwz22guuuAIWL047usL79FO4/no49NBoVgNYa62oM0yaBH/6U7rxiaRFiSEhq6wCu+1W3HWG0aPh3nvht7+FffeN5849Fz78EO64I93Y0jBuXCSHs85q/Pxxx8GWW8KZZ1ZmwhRRYkhQTQ28/jrMmZN2JF/11FPxB/C734Vzzvny+b33hoED4bLLYqnSSrF0aSTKPfeEqiYTBHTqBFdeCW+/HduIVBolhgQVa52hrg5+8APo1w9uvRWswSoYZtGHf8YMePDB1EIsuD/8Ad5/P64KmrPvvvDtb8NvfgP//W9hYxNJmxJDggYOjL7wxVRnWLw42tC/+CL+8K+xxle3+e53I2lcfHFlFFzd4fLLoX9/OPDAlrcbNSrevxEjChebSDFQYkhQMdYZTjkFnn8+rhT6929+m44d4eyzY5nSJ54oaHipmDQppko/4wzosJLfgH794OST4ZZb4r0RqRRKDAmrqYFp02D27LQjgZtvhhtuiKaiQw5Z+bY//jF87WtwySWFiS1NI0fCeuvFWI7WnHcerLNOJNhKuJoSASWGxK2oM6TdnFRbG5PC7bNPtJO3pksXOP10ePJJeKFk1tBru9deg0cegV/8IqZLb82aa8b79+yzUZcQqQRKDAnbYQdYffV0E8PcuXGF0Lt3jFfo2DG7/YYNi8Fdl16a3/jSNGpUjDf5+c+z3+eYY2DbbaO57Ysv8hebSLFQYkhYp06wxx7p1RmWLYMjjogusw88EM0g2Vp9dTjppChSv/lm/mJMy6xZcNddcPTRsPba2e/XsWN0W3333RgMKFLulBjyoLoapk+PgWOFNmIETJwYI3p32KHt+598cjSxXHZZ8rGl7aqrYqzGaae1fd/q6rgKu/jiSDAi5UyJIQ9qauK+0M1Jf/xj/EE//nj4yU/ad4xevaLp5M47o59/ufjss0iW3/sebLpp+44xcmQMjDv33GRjEyk2Sgx5sN12UbQsZHPSG29EMthpp9xH655xRkwsd+WViYRWFG66CRYsiH9be226aRTob7+9vAv0IjklBjPraWaPm9mMzP1aLWw3NLPNDDMb2uD5p8xsuplNzdzWzSWeYtGxY9QZCnXFsGBBDFJbdVW4//5YHyIXm2wCP/xhzLr68ceJhJiqZcsiye2+OwwenNuxfvnLKOpr9lUpZ7leMQwHJrp7P2Bi5nEjZtYTOB8YDAwCzm+SQI509+0ytyKcZah9qqtjmokPPsjveerr40rhrbeiO+UGGyRz3LPPjim6r702meOl6f774b33Wp7+oi1WXx1+9zv45z+jx5dIOco1MQwBbsv8fBvwnWa22Q943N3nuft84HFg/xzPW/QKVWe49NKYHnrUqLhKScpWW8VcQVddFQmiVLlHbeAb34CDDkrmmEOHxvQn55xT2u+NSEtyTQzrufuHAJn75pqCNgAaljHrMs+tcEumGek8s4bTuzVmZsPMrNbMaueWwKK822wTYwLyWWd47LHohfTDH0ZvoqQNHx5NSePGJX/sQnnqqZjOorXpL9qiQ4eo49TVRdIRKTet/qqY2RNm9loztyFZnqO5P/YrWmePdPetgd0ztx+3dBB3H+vuVe5e1atXryxPnZ6OHWNK53xdMbzzToxX2GqrqAW0nFLbb5ddol1+1ChYsiT54xfC5ZfDuuvGlB9J2m23mLH2ssvKq/eWCGSRGNx9H3ffqpnbQ8BsM1sfIHPfXI2gDtiwweM+wKzMsT/I3H8G3E3UIMpGdXW0/Sf9h+OLL6JPfX19DEbr3j3Z4zc0fHjEX4rt6a+/Dn/9awzay2b6i7a67LJoqmq4voVIOcj14noCsKKX0VDgoWa2eRT4
ppmtlSk6fxN41Mw6mdk6AGa2CnAQ8FqO8RSVfNQZ3GM6h5dfjlG8X/96csduzgEHRLPYpZdGIiolV1wB3brFuI582GijWPzonnvguefycw6RNOSaGC4B9jWzGcC+mceYWZWZjQNw93nARcDkzO3CzHNdiATxCjAV+AC4Mcd4isrWW0PPnsnWGa67LvrRn3/+ytcSSMqKhXymTYM//zn/50vKhx/GIL2jj27btCBtdc450RPslFNKL3GKtMS8BDtjV1VVeW1tbdphZOWQQ2Dq1FgmMlfPPRd1i/33h4ceSq6Y2pply2DzzaOt/p//zE89I2m//GVMIT5jRv6vqu68M2oYt94aPZZEipWZTXH3qta208jnPKuujkLxu+/mdpz//jdWYtt4Y7jjjsIlBYiJAc86Kxb8KbZlS5uzcGFcWR1ySP6TAkSvsMGDY6qMhQvzfz6RfFNiyLMk6gxLl8Jhh8UI5wcfjG6whfaTn8QVQyks5HPzzfDJJ8kMaMtGhw7w+99H89XFFxfmnCL5pMSQZ1tuGVM851JnOPPMWCjmppuibpGGbt1iGohHH41lMYvViukvdt015o0qlMGDY0W4UaPiClGklCkx5FmHDtGc1N4rhjvvjNHHp50Ghx+eZGRtd/zxMSVEMS/k88c/wn/+E01fhXbJJV+uny1SypQYCqCmJmoMbf0m+fLLsarannsWxx/jHj1iudA//AFmzkw7mq9aMf1Fv34xnUehbbBB9OC6//7SqMWItESJoQBWrAPdluakefNixtSePeG++2CVVfISWpudckrEcvnlaUfyVU8/DVOmJDv9RVudeWaMbzj11FgUSKQUKTEUwIABsQBOts1Jy5fDkUfGXDz33w/rrZfX8Npk/fWjEH3LLemsULcyl18e7/NRR6UXQ7duMSJ66tR4j0RKkRJDAZjFVcOkSdnN4X/BBfDII3D11YUtoGbrzDOjyJvrgkBJmjYNHn44pr/o1i3dWL7//ZhLacQI+PTTdGMRaQ8lhgKpqYkrgNYGuk2YABddFCN2hw0rTGxttdlm0X32uuuiW2gxGDUq5kM64YS0I4kvAqNHw9y58JvfpB2NSNspMRRINnWGGTNiBO3AgbFATjGPMD7nnFhH+brr0o4kBv/dcQf89Kf5nf6iLQYOjCa30aOLs1AvsjJKDAXSv3/UClqqMyxcGMXmVVaJLpf5mA00SdtvH1NzjB4ds72m6ZprYhDgaaelG0dTv/sddOlSuIF2SfroIzWDVTIlhgJZWZ3BHY45JtrJ7703pr0oBcOHw5w5MUdQWhYtgjFj4DvfiW6qxaR376gzPPQQTJyYdjTZWbIkxmNsvDFUVUVzmFQeJYYCqqmBWbO+2rRwxRUwfnxMp7DPPunE1h577BHF8ZEjoxidhltugfnz0xnQlo1TT4W+feM+rfcoWxMnxhTr554b/7fvvw/f+paWL61ESgwF1FydYdKkaK//3veK949bS1ZMyf3OO5HYCm3Zskiqu+wCO+9c+PNno2vX6Eb72mtwY5FOKv/BB7Ea3T77xHv617/C3/4WV69TpkQvq6VL045SCsrdS+42cOBAL0X19e7rr+9+xBHx+L333Hv1cu/f3/3TT9ONrb2WL3ffYgv3rbeOf18hjR/vDu4PPFDY87ZVfb37nnu6r722+7x5aUfzpSVL3EeOdF9tNfeuXd0vvND9iy8ab3P99fEeH3104f9/JXlArWfxNzb1P/LtuZVqYnCPpNC7d/wC7rij++qru0+blnZUubn11vgk/eUvhTtnfX28f5tt5r5sWeHO214vveRu5n7qqWlHEiZNch8wIP7fDjrI/a23Wt72//4vtvvVrwoWnuSJEkORGjs23vX99iuNb7vZWLLEfcMN3XffvXDnfPrpeP/GjCncOXP1s5+5d+qU7heBWbPcjzwy3rtNNnGfMKH1ferr3Y89tvTeb/kqJYYi9e9/x7sO7ueem3Y0yfn97+Pf9OyzhTnfwQe7r7OO+6JF
hTlfEmbPdl9jDfcDDyz8uZcudR89Os7fubP7eee17b1bujSuLMzK48tMpVJiKFL19e79+sUVQyk0gWRr4cJoQz/ooPyfa9q0+OSef37+z5W0kSMj9r/9rXDnfPZZ9222+fJK9d//bt9xFi50HzzYvUsX92eeSTZGKYxsE4N6JRWYGbz4Yszr07Fj2tEkp3t3OPnk+He9+mp+z3XFFcUz/UVbnXxyTCly2mn57+kzZ06Mvt5tt+jS+8c/Rm+j9o736N49/n833jimNX/jjUTDlSKixJCC1VaLdZTLzYknxh+Pyy7L3zlmz4bbb4ehQ2Op0VLTuXPM6/Tmm/mbTmT58phS5RvfgLvvji7F06bFGti5TrOyzjqxil/XrjHyva4umZiluOSUGMysp5k9bmYzMvdrtbDdI2b2iZk93OT5vmb2fGb/+8yscy7xSLrWXjsm/rvnnlhFLR+uvTZG555+en6OXwjf/naMGfj1r+Hjj5M99r/+BTvuGLPMDhwIr7wSAye7d0/uHJtsElcen3wCBxxQPBMpSnJyvWIYDkx0937AxMzj5owEftzM85cCV2b2nw8ck2M8krLTT49FckaNSv7Yn38eiWHIENh88+SPXyhmsS71ggWRHJLw0Ufws5/FQL/Zs2Nxp8cfjzm68mG77eDBB2H69Pj/+N//8nMeSUk2hYiWbsB0YP3Mz+sD01eybTXwcIPHBnwEdMo83hl4NJvzlnLxuRIcfXQMmJo9O9njXnNNYXs+5dsJJ7h37Oj+2mvtP8by5e433ODes2d0hT3zzMIOlrz77vg/OfTQ8upMUa4oUPF5PXf/MJNgPgTa0uq7NvCJu6+YQaYO2KCljc1smJnVmlntXM3sVdTOOgsWL4arrkrumMuXR9F5p51iCoxycMEFsPrqcZXlWSzg1FRtbbwfxx0HW28dq8aNHBnHLJQjjoirw/vvj4J6e/4dUnxaTQxm9oSZvdbMbUiO526uDNbix8rdx7p7lbtX9erVK8dTSz717x9TiF97bXJTN//pT7HI0ZlnFvc6FW2xzjrRlPTYY/CXv2S/37x5cPzxMGgQvPce3HlnzLm15ZZ5C3WlTj89bldfnd+OB5Vu2jTYay94660CnCyby4qWbqgpSVrw/PPRxDByZO7Hqq+P/vNf/3r5NVcsWRJzZfXr57548cq3Xb7c/aabYmBfhw7up5zi/sknhYmzNcuXx3Qv4H7bbWlHU37q691ratx79MitiZYCNSVNAIZmfh4KPNSGhOTAJODQ9uwvxW3QoPh2c8UV0ayUi+eeg+efj2+l5TT2A2JhpiuuiNX7rrmm5e2mTo3xCMccE4X3F1+MRZLWXLNwsa5Mhw4xBfree0eMjzySdkTl5d5746rwd78rUDftbLJHSzeiTjARmJG575l5vgoY12C7Z4C5wBdELWG/zPObAi8AM4E/AF2yOa+uGErD44/HN8gbb8ztOEOGxKjqUpr+oq0OOMB9zTXd58xp/Pz8+e6/+EVcIfTqFRMWLl+eTozZWLDAfbvt3Lt3d588Oe1oysOCBTErc1VV7lfMaEoMSVt9vfvAgdFM0t4P9Jtvxvw8552XbGzFZtq06FV03HHxuL7e/fbb3ddbL5LCiScW15TdKzNrVkzQt+667jNnph1N6Tv11PgdeOGF3I+VbWLQyGfJmxUL+cyYAQ880L5jXHlljBY+8cRkYys2/fvHv/HGG6PZYM894aijYjDZ5MnRzLRWs8NHi8/660dT0vLlsN9+MTWHtM8rr0RRf9iwGLhYKBZJpLRUVVV5bW1t2mFIFpYvhy22iC6UtbVt61E0Z07My3PUUXDDDfmLsVjMnx/zGH38cYwiv+QSOProaL8vRf/6V9SZttwy2sdXWy3tiEpLfX0ssTp9etx69sz9mGY2xd2rWtuuRD9yUio6doSzz45i6RNPtG3fMWNiRG0pT3/RFmutBbfeGuNApk+HY48t3aQAMcZi/Hh46SU49FAtD9pWt98O//gH
XHppMkmhLXTFIHm3eDFsumk0l0ycmN0+n38OG20Eu+4KD6mvWkkbNy6m6zjqqEh85TIOJZ/mz49JEDfbDJ59NrkvCLpikKLRpUt863/ySXjhhez2ue22aFI588z8xib5d+yxMcr79tthxIi0oykNI0bE53/MmHSuGpUYpCCGDYMePeKyuDUrpr8YNCj67kvpO++8+AxcfPHKx2tI1OKuvz5myN1uu3RiUGKQglh99figP/hgrEWwMhMmwMyZ0dauZofyYPblzLgnnxxzK8lXLV8eC1Cttx5ceGF6cSgxSMGcfHIs8NLafDqXXw59+8Z8S1I+OnWKtTp23hl+9CN4+um0Iyo+48ZF9+TLL093VLsSgxRMr14xXcKdd8L77ze/zXPPxa0cp78Q6NYN/vznSPwHHwyvvZZ2RMVj7lw491yoroYf/jDdWJQYpKDOOCP6Z195ZfOvX355dM376U8LG5cUTs+eMQCue/dYHrSlLwmVZvhw+OyzaHJLuwlViUEKapNN4tvQ2LFfXdZyxoyYXvuEE5JdilKKz8Ybx/Kgn30WyWHevLQjStdzz8HNN8eaFgMGpB2NEoOk4OyzYdGi+GbU0JVXxmyjJ52UTlxSWNtsE2NUZs6MovQXX6QdUTqWLYsvQ336wP/9X9rRBCUGKbittoJvfztWeFu0KJ6bOzembT7qqOiRIZWhuhruuCNG+B55ZPTKqTRjxsDLL8cXo2KZNkSJQVIxfHg0JY0bF48rbfoL+dL3vx9/FB98MHquleBkDO324YcxxuOb34TvfS/taL6kxCCp2GUX2H33WC/4009j0NNBB8WEe1J5TjklmhjHjInFaCrFWWfFF6Jrrkm/4NyQEoOkZvjw6JEyZAh89FH8kkjluvjiGN/wq19Fs2K5e+opuOuuSIj9+qUdTWOaRE9S4x5D/l95Jeaaf/754vrWJIW3ZEnUnyZOjBHwBx6YdkT5sXRpfPY//xxefx1WXbUw59UkelL0zGJAD2j6CwmdO8d0GdtuC4cdlv2ki6Vm9Gh4443ogFGopNAWumKQVLlHj4xtt1VikC/9979Rh/rss+ixtPnmaUeUnLq6mIJ+r73iqqiQdMUgJcEsLqmVFKSh3r3h0Ufjc1FdHd+uy8Vpp0UG9FREAAAKOUlEQVS33N//Pu1IWpZTYjCznmb2uJnNyNw3uyqtmT1iZp+Y2cNNnr/VzN4xs6mZW0qTzIpIsenXL5YEdY/k8PLLaUeUu8cei6ayESNivqhilesVw3Bgorv3AyZmHjdnJPDjFl47y923y9ym5hiPiJSRLbeMWVi7dIGamliroFQtXhyj+vv1K/4eeLkmhiHAbZmfbwO+09xG7j4R+CzHc4lIBerXL5JDjx6w995RcyhFI0fGfGDXXBOJrpjlmhjWc/cPATL367bjGL81s1fM7EozK/K3S0TS0LdvJIfevWG//aKJqZS88w789rdw6KExyrnYtZoYzOwJM3utmduQBM5/LtAf2BHoCZyzkjiGmVmtmdXOnTs3gVOLSCnp0wf+/veYoffAA2Pq7lJxyimxvkhL080Xm1YTg7vv4+5bNXN7CJhtZusDZO7ntOXk7v6hh8XALcCglWw71t2r3L2qV69ebTmNiJSJ3r1jxHD//jFi/qGH0o6odX/+c9zOPz+SWynItSlpAjA08/NQoE3/TQ2SihH1Ca3nJCIrtc468OSTsP320TQzfnzaEbXsiy/iamHAADj11LSjyV6uieESYF8zmwHsm3mMmVWZ2bgVG5nZM8AfgL3NrM7M9su8dJeZvQq8CqwD/CbHeESkAqy1VnT93GknOOIIuP32tCNq3sUXR33h2mtjrZFSoZHPIlKyFi2KJqUnn4Trr4dhw9KO6EszZsTaI4cdFuucFwONfBaRste9Ozz8MBxwABx3XMw9VAzc4Re/gK5dYx3zUqPEICIlrWvXWOTnkEOiPf/SS9OOCB54IKb0uOiiKJiXGiUGESl5nTvDffdFvWH4cPj1
r9NbCW7hwig0b7ddrOVcijqlHYCISBI6dYr1o7t2hQsuiB5Bl1xS+AkaL7ooZlAdPz5iKkUlGraIyFd17BjriHftCpddFslh9GjoUKC2kTfegCuugKOPhp13Lsw580GJQUTKSocO0T20W7f4I/2//0WPpXwnB3c48URYffW4UillSgwiUnbMojdQt24xR9H//gc335zfpp177olR2ddfD6U+OYMSg4iUJTP4zW8iOfzqV5Ec7rorPwPNFiyAM86ItcuPPTb54xeaEoOIlLURIyI5nHFGJIfx46MGkaTzz4fZs2NOpI4dkz12GtRdVUTK3umnw5gx8Yd7yBD4/PPkjv3yy3D11fDzn0NVq2OKS4MSg4hUhOOPh5tugscfh299K8Yb5Kq+PsYqrL121DLKhRKDiFSMo4+OOsMzz8SCOQsW5Ha8226D556LrrFrNbvifWlSYhCRinLEEVFnqK2NpUI//rh9x5k3D84+G3bZBY46KtkY06bEICIV55BDYn6l116DmhqY06YlxsKIEZEcxowp3AC6Qimzf46ISHa+9a2YmXXmTNhzT/jgg+z3nTwZbrghZlDddtv8xZgWJQYRqVj77BOzoNbVRXJ4993W91m+PArO660XczKVIyUGEalou+8OTzwRtYY99ogriJW58caoT4waBWuuWZgYC02JQUQq3uDBsQrcokWRHN58s/nt5s6FX/4y6hJHHFHYGAtJiUFEBNh++5jrqL4+ksMrr3x1m3POgc8+g2uuKfx03oWkxCAikrHVVvD007HwT00NTJny5Wv/+AfcckuMoh4wIL0YC0GJQUSkgc03j+Swxhqw114xgG3Zsig49+kD552XdoT5l1NiMLOeZva4mc3I3H9l7J+ZbWdm/zSz183sFTP7QYPX+prZ85n97zOzzrnEIyKShE03jeSw7roxQnrYsGhaGj0aVlst7ejyL9crhuHARHfvB0zMPG7qc+Aod98S2B8YbWY9Mq9dClyZ2X8+cEyO8YiIJGLDDSM5bLxxNCHtt18MjKsEuSaGIcBtmZ9vA77TdAN3/7e7z8j8PAuYA/QyMwP2Au5f2f4iImlZf/0oSJ9+OowdW94F54ZyXY9hPXf/EMDdPzSzdVe2sZkNAjoDbwFrA5+4+7LMy3XABjnGIyKSqF69YsxCJWk1MZjZE0DvZl4a0ZYTmdn6wB3AUHevz1wxNOUr2X8YMAxgo402asupRUSkDVpNDO6+T0uvmdlsM1s/c7WwPtFM1Nx2awB/AX7l7v/KPP0R0MPMOmWuGvoAs1YSx1hgLEBVVVWLCURERHKTa41hAjA08/NQ4KGmG2R6Gj0I3O7uf1jxvLs7MAk4dGX7i4hIYeWaGC4B9jWzGcC+mceYWZWZjcts831gD+AnZjY1c9su89o5wOlmNpOoOdyUYzwiIpIjiy/upaWqqspra2vTDkNEpKSY2RR3b3Vlao18FhGRRpQYRESkESUGERFppCRrDGY2F8hiraWitg7RZVf0XjSl96MxvR9fyvW92Njde7W2UUkmhnJgZrXZFIEqgd6LxvR+NKb340uFei/UlCQiIo0oMYiISCNKDOkZm3YARUTvRWN6PxrT+/GlgrwXqjGIiEgjumIQEZFGlBhERKQRJYY8M7MNzWySmU3LrHt9Sub5VtfLLmdm1tHMXjKzhzOPK3b9bzPrYWb3m9mbmc/JzpX6+TCz0zK/J6+Z2T1m1rWSPhtmdrOZzTGz1xo81+xnwcJVZjbTzF4xsx2SikOJIf+WAWe4+xbATsCJZjaA7NbLLmenANMaPK7k9b9/Dzzi7v2BbYn3peI+H2a2AXAyUOXuWwEdgcOprM/GrcD+TZ5r6bNwANAvcxsGXJdUEEoMeebuH7r7i5mfPyN+6Tcgi/Wyy5WZ9QG+BYzLPK7Y9b8zi1jtQWbKeXdf4u6fULmfj05ANzPrBKwKfEgFfTbc/WlgXpOnW/osDCHWufHMAmg9Mgum5UyJoYDMbBNge+B5mqyX
Dax0vewyMxo4G6jPPK7k9b83BeYCt2Sa1saZWXcq8PPh7h8AlwPvEQlhATCFyv1srNDSZ2ED4P0G2yX23igxFIiZrQb8ETjV3T9NO560mNlBwBx3n9Lw6WY2rZR+1J2AHYDr3H17YBEV0GzUnEzb+RCgL/A1oDvRXNJUpXw2WpO33xslhgIws1WIpHCXuz+QeXr2isu+la2XXYZ2BQ42s/8A9xLNBKPJrP+d2Wal63+XmTqgzt2fzzy+n0gUlfj52Ad4x93nuvtS4AFgFyr3s7FCS5+FOmDDBtsl9t4oMeRZpv38JmCau1/R4KVW18suR+5+rrv3cfdNiMLik+5+JBW6/re7/xd438y+kXlqb+ANKvPz8R6wk5mtmvm9WfFeVORno4GWPgsTgKMyvZN2AhasaHLKlUY+55mZ7QY8A7zKl23qvyTqDOOBjYhfiMPcvWnRqayZWTVwprsfZGabElcQPYGXgB+5++I04yuUzBro44DOwNvAT4kvbRX3+TCzC4AfEL35XgKOJdrNK+KzYWb3ANXE9NqzgfOBP9HMZyGTPK8hejF9DvzU3RNZ81iJQUREGlFTkoiINKLEICIijSgxiIhII0oMIiLSiBKDiIg0osQgIiKNKDGIiEgj/w9xdK7Acf0v9wAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f605c8ab7f0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the score obtained for each number of clusters to pick the best K\n",
    "# (highest score).\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(Ks, CH_scores, 'b-')\n",
    "# Label the axes so the figure can be read on its own. The plotted values are\n",
    "# silhouette scores returned by K_cluster_analysis.\n",
    "ax.set_xlabel('number of clusters K')\n",
    "ax.set_ylabel('silhouette score')\n",
    "ax.set_title('MiniBatchKMeans: silhouette score vs. K')\n",
    "plt.show()  # render the figure without echoing the Line2D repr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 看起来最佳的 K 应该在 2 到 10 之间，在这个范围内重新搜索一遍"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 2\n",
      "CH_score: 0.5675647497172704\n",
      "K-means begin with clusters: 3\n",
      "CH_score: 0.4376082417512089\n",
      "K-means begin with clusters: 4\n",
      "CH_score: 0.4148334093343713\n",
      "K-means begin with clusters: 5\n",
      "CH_score: 0.15603500601118594\n",
      "K-means begin with clusters: 6\n",
      "CH_score: 0.006054934559972887\n",
      "K-means begin with clusters: 7\n",
      "CH_score: 0.27058118594714253\n",
      "K-means begin with clusters: 8\n",
      "CH_score: 0.02657993060648849\n",
      "K-means begin with clusters: 9\n",
      "CH_score: 0.20282759648602514\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 0.14579594529481765\n"
     ]
    }
   ],
   "source": [
    "# Fine search over the number of clusters K: 2, 3, ..., 10.\n",
    "Ks = list(range(2, 11))\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, eventContMatrix)\n",
    "    CH_scores += [ch]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7f604796fdd8>]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl81NX1//HXSQBld4FSBARUquKKjYD6K61SLYririBulEUkKIL9WrVWK9rWra7FACqCK1LcQKuIYrXaqgTFBZGKaCWiGNzRKiL398dJJMRAJsnM3Fnez8cjj2RmPsycxvTMnXvPPddCCIiISG4piB2AiIgkn5K7iEgOUnIXEclBSu4iIjlIyV1EJAcpuYuI5CAldxGRHKTkLiKSg5TcRURyUKNYL9ymTZvQpUuXWC8vIpKVFixYsCqE0La266Il9y5dulBaWhrr5UVEspKZ/TeR6zQtIyKSg5TcRURykJK7iEgOUnIXEclBSu4iIjlIyV1EJAcpuYuI5KCsS+5vvgnnnQc6HVBEZOOyLrnPmgWXXQZ/+lPsSEREMle0Har1NW4cLFwIF1wAO+8MRx0VOyIRkcyTdSN3M7jpJujdG046CV56KXZEIiKZJ+uSO8Dmm8P998NWW8GAAfDBB7EjEhHJLFmZ3AF+/GOff//4YzjiCPj669gRiYhkjqxN7gA9esDtt8Pzz8OwYaqgERGplNXJHXxB9ZJL4M47vYpGRESysFqmJr/7Hbz+Opx/vlfQHHFE7IhEROLK+pE7eAXNLbfA3nvDiSfCyy/HjkhEJK6cSO4ATZvCgw/CFlvAYYfBypWxIxIRiSdnkjtA+/ZeQbNqlc/Ff/NN7IhEROLIqeQOsNdecNtt8K9/wYgRqqARkfyUc8kd4Jhj4OKLPclfeWXsaERE0i8nqmVq8vvfw6JFcO65sNNOvpNVRCRf5OTIHbyC5tZb4ac/hcGD4dVXY0ckIpI+OZvcAZo1gwcegJYtvYLmww9jRyQikh45ndwBOnTwEsmVK1VBIyL5I+eTO/jmpqlT4dlnYeRIVdCISO7L2QXV6o4/3lsUjB8Pu+wCv/lN7IhERFInb5I7wEUXeYI/5xzvQdO/f+yIRERSI6FpGTPrZ2ZLzGypmZ1bw+Onmlm5mS2s+BqW/FAbrqAApk3zVsGDBnmppIhILqo1uZtZITABOBjoDgwys+41XHpPCGHPiq+bkxxn0jRr5guszZt7Bc2qVbEjEhFJvkRG7j2BpSGEZSGENcB04PDUhpVaHTt6ieSKFXD00bBmTeyIRESSK5Hk3gFYXuV2WcV91R1tZq+Y2Uwz65SU6FKoVy+YMgWefhpGjVIFjYjklkSSu9VwX/VUOBvoEkLYHXgcmFbjE5mNMLNSMystLy+vW6QpcMIJftDHLbfAtdfGjkZEJHkSSe5lQNWReEdgRdULQggfhRAqtwfdBPy0picKIUwOIRSFEIratm1bn3iTbvx4OPJIL4185JHY0YiIJEciyX0+0M3MuppZE2AgMKvqBWbWvsrNAcDi5IWYWgUFfsj27rvDwIFeKikiku1qTe4hhLXAaGAOnrRnhBAWmdl4M6vstXimmS0ys5eBM4FTUxVwKjRv7od8NG2qChoRyQ0WIq0kFhUVhdLS0iivvTHPPQe/+AX07g2PPQZNmsSOSERkQ2a2IIRQVNt1edFbJlG9e/vi6lNPwejRqqARkeyVV+0HEjF4sO9c/fOfYddd4cwzY0ckIlJ3GrnX4NJL4fDDYexYmDMndjQiInWn5F6DggK44w4fuR9/PLzxRuyIRETqRsl9I1q08AqazTbzCpqPP44dkYhI4pTcN6FzZ7j/fnj3XTj2WPj229gRiYgkRsm9FvvuCzfdBPPm+eKqKmhEJBuoWiYBJ5/sFTRXXOGnOI0eHTsiEZFN08g9QX/6k8+9n3UWzJ0bOxoRkU1Tck9QYSHceSd07+7z
70uWxI5IRGTjlNzroGVLr6Bp0sRH8Z98EjsiEZGaKbnXUZcucN998M47cNxxqqARkcyk5F4P/+//waRJ8PjjvotVRCTTqFqmnoYM8Qqav/zFK2hOPz12RCIi62nk3gCXXw79+8MZZ8ATT8SORkRkPSX3BigshLvugp128gqaN9+MHZGIiFNyb6BWrbyCpqAADjwQzj3XF1zLyrSbVUTi0Zx7Emy3HcyeDWPGwNVXr6+gad8eevZc/1VUBFtsETdWEckPSu5Jss8+8MIL8PXX8PLL/vMLL8D8+fDgg+uv23HHDRP+Hnt450kRkWRSck+yzTeHXr38q9Knn0Jp6fqEP3cu3H67P9a4sSf4qgl/xx19mkdEpL50QHYEIcB7761P9pUj/NWr/fFWrXwKp2rC79AhbswikhkSPSBbI/cIzKBjR/866ii/77vvvF9N1YR/1VWwdq0/vs02P5y/b9063v8GEclsGrlnsK+/hoULN0z4Vcstd9ppw4S/++6avxfJdRq554DNN4fevf2r0scfr5+/nz/fD/C+7TZ/rEkT2HPP9cl+773hJz/R/L1IPtLIPcuF4DX1VUf3paUbzt8fdBBMmwbNmsWNVUQaTiP3PGEGnTr519FH+33ffQdvvOGJ/umnYepU32A1YkTUUEUkjTRyz3EhQI8e/n3hQn8zEJHslejIPaHZWDPrZ2ZLzGypmZ27ieuOMbNgZrW+sKSHGRQXwyuvwLPPxo5GRNKl1uRuZoXABOBgoDswyMy613BdS+BM4PlkBykNc8IJXjY5YULsSEQkXRIZufcEloYQloUQ1gDTgcNruO4S4Arg6yTGJ0nQvLn3n7/3Xvjgg9jRiEg6JJLcOwDLq9wuq7jve2bWA+gUQnhoU09kZiPMrNTMSsvLy+scrNTfqFHe0Oymm2JHIiLpkEhyr2kJ7vtVWDMrAK4Bzq7tiUIIk0MIRSGEorZt2yYepTRYt25eEjlp0vpdryKSuxJJ7mVApyq3OwIrqtxuCewK/MPM3gF6A7O0qJp5iou9p82sWbEjEZFUSyS5zwe6mVlXM2sCDAS+Tw8hhM9CCG1CCF1CCF2A54ABIQTVOWaY/v2hc2ctrIrkg1qTewhhLTAamAMsBmaEEBaZ2XgzG5DqACV5Cgth5EiYNw8WL44djYikkjYx5Znycu9GOWIE3HBD7GhEpK6SuolJckfbtnDccd5r5osvYkcjIqmi5J6Hios9sd9xR+xIRCRVlNzzUK9esNdevrAaaVZORFJMyT0PVfabWbTIu0aKSO5Rcs9TAwfClluqLFIkVym556lmzeDXv4b774cVK2q/XkSyi5J7Hjv9dD/YY/Lk2JGISLIpueex7beHfv08uX/7bexoRCSZlNzzXHExvP8+PPBA7EhEJJmU3PNcv37QtasWVkVyjZJ7niss9Ln3p56C116LHY2IJIuSuzBkCGy2Gdx4Y+xIRCRZlNyFNm287v322+Hzz2NHIyLJoOQugC+srl4Nt90WOxIRSQYldwFg773968Yb1W9GJBcoucv3iov9EI8nn4wdiYg0lJK7fO/442HrrVUWKZILlNzle5tvDkOHwoMPQllZ7GhEpCGU3GUDI0fCunUwaVLsSESkIZTcZQNdu0L//nDTTbBmTexoRKS+lNzlB4qLYeVKuO++2JGISH0pucsPHHSQd4zUwqpI9lJylx8oKPB+M888A6+8EjsaEakPJXep0ZAhXj2j0btIdlJylxpttRWccALccQd8+mnsaESkrpTcZaOKi+Grr2DatNiRiEhdJZTczayfmS0xs6Vmdm4Nj480s1fNbKGZPWNm3ZMfqqTbXntB797eb2bdutjRiEhd1JrczawQmAAcDHQHBtWQvO8KIewWQtgTuAK4OumRShTFxfCf/8ATT8SORETqIpGRe09gaQhhWQhhDTAdOLzqBSGEql3AmwPqK5gjjj0W2rbVwqpItkkkuXcAlle5XVZx3wbMrNjM3sJH7mfW9ERmNsLM
Ss2stLy8vD7xSpptthkMGwazZ8O778aORkQSlUhytxru+8HIPIQwIYSwPfBb4IKaniiEMDmEUBRCKGrbtm3dIpVoRo707+o3I5I9EknuZUCnKrc7Ais2cf104IiGBCWZZdtt4bDDvN/MN9/EjkZEEpFIcp8PdDOzrmbWBBgIzKp6gZl1q3KzP/Bm8kKUTFBcDOXlMHNm7EhEJBG1JvcQwlpgNDAHWAzMCCEsMrPxZjag4rLRZrbIzBYC44BTUhaxRNG3L3TrpoVVkWzRKJGLQgh/B/5e7b4Lq/w8JslxSYYpKIBRo2DsWHjpJejRI3ZEIrIp2qEqCTv1VGjWTKN3kWyg5C4J22ILGDwY7roLPvkkdjQisilK7lInxcXwv//BrbfGjkRENkXJXepkjz1gv/3Ub0Yk0ym5S50VF8Nbb8Fjj8WOJH8sWQI77AALF8aORLKFkrvU2dFHQ7t2WlhNp+uu8zfUyy6LHYlkCyV3qbMmTWD4cHj4YXj77djR5L4vvoDbb/eTsWbOVI8fSYySu9TLaad57bv6zaTeXXfB6tUwdSqEAH/9a+yIJBsouUu9dOwIAwbAzTfD11/HjiZ3hQAlJb6QfdxxPiU2ebIne5FNUXKXeisuho8+ghkzYkeSu55/Hl5+2TtzmsG4cfDZZypFldopuUu9HXAA7LSTFlZTaeJEaNHCN4+BH3vYu7cvsH73XdzYJLMpuUu9mXm/mRdegNLS2NHkno8/hnvugRNPhJYt198/bpxXzsyeHS82yXxK7tIgJ58MzZtr9J4KU6f6esbpp294/5FHQufOcM01UcKSLKHkLg3SujWcdBJMn+7z75IcIfiUzL77wu67b/hYo0Zwxhnw9NOwYEGc+CTzKblLgxUX+whzypTYkeSOefPgzTfXH3FY3bBhPhev0btsjJK7NNiuu0KfPl6yp0W+5Jg4EbbaCo49tubHW7eGoUN9Tv6999Ibm2QHJXdJiuJi36366KOxI8l+778PDzwAQ4b4rtSNOfNMb96mTU1SEyV3SYojj4T27bWwmgw33wxr1/ou4E3Zbjs44gjfJfzll+mJTbKHkrskRePGMGKEj9zfeit2NNlr7VrfgXrggX5mbW3GjvWDU267LfWxSXZRcpekGT7c+81MnBg7kuz1979DWdnGF1Kr228/2HtvX1hVf32pSsldkqZDB5+emTLFT2uSups4EbbZBg47LLHrzXz0/uab/sYgUknJXZKquNh3Vk6fHjuS7FO5ID1smE9zJeqYY7yR29VXpy42yT5K7pJUP/857LKLL6yGEDua7DJ5sk9rDR9et3/XuLFvanrySZ3UJOspuUtSVfabWbDAe85IYr75Bm65xadjOnas+78fPhyaNYNrr01+bJKdlNwl6U46yRtdqSwycffdB+XliS+kVrfllvDrX/vBHu+/n9zYJDspuUvStWzpDcXuuccTltRu4kSvWz/wwPo/x5gxXkp5443Ji0uyV0LJ3cz6mdkSM1tqZufW8Pg4M3vdzF4xsyfMrHPyQ5VsMmoUrFnjUw2yaYsWeROwyqML62uHHXxap6RE1UqSQHI3s0JgAnAw0B0YZGbdq132ElAUQtgdmAlckexAJbt07w777+8jUvWb2bRJk/zQ8SFDGv5c48Z5d87bb2/4c2W7fD/+MZFxQk9gaQhhWQhhDTAdOLzqBSGEJ0MIX1XcfA6ox5KQ5JriYvjvf+Hhh2NHkrm+/BKmTfNyxrZtG/58ffpAjx6+sJrPm5pefNHXIfr2zd+F/USSewdgeZXbZRX3bcxQ4JGGBCW54fDDfUOOFlY37u674fPPf3ggR31VnrO6eDHMmZOc58w2333nC9PNm8Orr0KvXr65btGi2JGlVyLJ3Wq4r8YKZjM7ESgCrtzI4yPMrNTMSsu10pbzGjXyeeTHHvMdlPJDEyd6y+T99kvecx53nDdxy9de75Mnw/z5cP313ufo4ovhiSf80JNTT4V33okdYXokktzLgE5VbncE
VlS/yMx+CfwOGBBC+KamJwohTA4hFIUQitom4zOoZLzhwz3Jl5TEjiTzzJ/v+wFGjvQRd7I0aQKjR8PcufDaa8l73mywciWcd54f3j5okFduXXghLFvmbRqmT4ef/MTbJa9cGTva1Eokuc8HuplZVzNrAgwEZlW9wMx6AJPwxP5h8sOUbNW+PRx9NNx6K3z1Ve3X55OJE33q4KSTkv/cp50GTZvm3+j9N7/xv7MJEzZ8w2zTBq66CpYu9dH7jTfC9tvDBRfAZ59FCzelak3uIYS1wGhgDrAYmBFCWGRm481sQMVlVwItgL+Z2UIzm7WRp5M8VFwMn37qG2zEffKJz7efcAK0apX85996azjlFLjzTvgwT4ZbTz4Jd9wB55wDO+1U8zUdO/q0zeuvw6GHwh//CF27whVX5N7gw0KkBiBFRUWhtLQ0ymtLeoUAe+wBhYVexZDMKYhsdf31vunoxRe9uiUVlizxJPeHP8BFF6XmNTLFmjX+N/bNN75w2rRpYv/upZfg/PO9Yds228Dvf+/HF9alcVu6mdmCEEJRbddph6qknJmP3hcuhH//O3Y08YXgUzK9eqUusQPsuCP07+9TFLle833VVfDGG37kYKKJHfz3/8gj8NRT0KWLVy3tvLN/ysz2UlIld0mLwYN9+kFlkZ5IFi+ufx+Zuhg71ltA5PKU2NtvwyWXwFFHwSGH1O85+vSBZ56Bhx7ydZDBg2GvvXyPRrZ2N1Vyl7Ro0cIXsv72t9yvUqjNxImwxRZw/PGpf60DDvASwKuvzt4ktSkheOVLYWHDO2Ka+Sedl17ytYrVq31e/mc/g3/+MznxppOSu6TNqFHw7bd+AHS+WrnSO0Ceemrdpg/qq/KkpkWL4PHHU/966fbggz7avvhi6NSp9usTUVDgC92LF3sJ77JlPrI/5JDs6pev5C5ps+OO8Mtf+sh17drY0cQxZYq/waVjSqbSoEHQrl3ulUWuXu2j9t128+/J1rix/3dauhQuvxyee87n6AcOzI5NeUruklbFxX4A9OzZsSNJv+++8yZhBxzgb3Tpstlm/nt/5BEfjeaK8eNh+XIfXaeyuqVZMy+vXLYMfvc7/9vdeWffS/Dee6l73YZScpe0OvRQ//icjz3H58zxRmrpHLVXGjnSk3yunNT02mv+SWTo0OS2btiULbaASy/1lgann+4b83bYAf7v/7wTZ6ZRcpe0quw38/jjXoedT0pK4Mc/hiOOSP9rt23rB6jcdhusWpX+10+mdes8ubZu7dMl6fbjH8MNN/jf73HHwV/+4getXHKJTxVlCiV3Sbthw/xjdD6N3itbH8fcIHPWWV7vPnFinNdPlmnTvGzxiit8J24sXbt6LK++6lNtF17oSf6663wzVWxK7pJ27drBscfC1Kne7jYf3HSTV66MGBEvhu7d4Ve/8r0GmZB86uOjj3waZL/9vOIoE+yyC9x/vy+47rabv4n+5Cf+9x3zoBold4li3Dj/CHvGGbEjSb01a7z885BDYNtt48Yybhx88IF3R8xG557rfYpKShp2JGEq9OrlrYXnzoUf/chP1tptNy99jbHHIMN+PZIvfvpT7+Nx2225vXsSvBZ75crkHcjREAce6CPNa67Jvk1N//qXv0mOHetJM1P98pd++tO99/rto4/2xJ/ufQZK7hLNBRf4x+uRI73MLFeVlHjfkl/9KnYkPjV01lnw8svwj3/EjiZxa9f6m2OnTtnRBM3M2yG88orvbVi50t9Y+/aF559PTwxK7hJNo0a+zbugwHt55OLGpjfe8Fa0I0b4FvlMMHiwV89cfXXsSBJ3/fWeKK+7zltZZItGjXx65j//8TLUV1+F3r292ibVlNwlqs6dfWPPc8/5FvJcM2mSV8f8+texI1mvaVMfBT/0kCedTFdW5qP1/v3jlJEmw2abeYvnt97yzVcDBtT+bxpKyV2iO/54H9388Y/eMTFXfPWVV0wcdZRXCGWSUaP8OL7rrosdSe3OOsur
Tm64IfvPAmjZ0teaOndO/WspuUtGuP563+134onw8cexo0mOGTO8siMTFlKra9fOp2emTs3s3/cjj/jC5AUXeF25JE7JXTJCixZeNbNypR+qnW2VHDUpKfEeJH36xI6kZmPH+qeLyZNjR1Kz//3PD/reaSc/G1XqRsldMkZRkU/N3Hdf9rcFfvFFL4cbOTJzpxJ2283L9m64wWvxM82f/uRVVDfe6FNIUjdK7pJRzj7bE86YMV5pkq0mTvSFy5NPjh3Jpo0dCytW+CEqmWTJEu8bc+KJsP/+saPJTkruklEKCnxjU/Pm3oc8G7fJf/aZl3gOGuSdBDNZv34+7ZFJJzWF4Au+zZv72ahSP0ruknHat/eNHwsXwnnnxY6m7u64w+eyM3EhtbqCAq9GefHFzDlK7u67Yd48n5bJtCqjbGIh0tt1UVFRKC0tjfLakh3OOMNPs3/kER9hZoMQfC67aVOYPz92NIn56ivf+dmnjzfAiunTT/2TxLbbwr//nTkbvzKJmS0IIRTVdp1G7pKxrrgCdt0VTjklew7VfvZZP680xoEc9dWsmX/KePBBP1IupgsugPJyX7NQYm8YJXfJWE2b+kf0zz/39q7r1sWOqHYlJX6IxMCBsSOpm+Ji3yp//fXxYigt9cqY4mLYa694ceQKJXfJaLvu6ifdPPpo3MSTiPJymDnTK2SaN48dTd20b+9vSFOm+NRIun33nX/aadfOTzSShksouZtZPzNbYmZLzezcGh7vY2YvmtlaMzsm+WFKPjv9dO/F8dvfwksvxY5m42691evFs2lKpqqxY+HLL/1gkXSbOBEWLPBWxK1bp//1c1GtC6pmVgj8BzgQKAPmA4NCCK9XuaYL0Ar4DTArhDCzthfWgqrUxapVsMce0KqVf3zPtJHxunXQrRt07Jjd/XH239+bWy1b5tM06fDBB7DjjtCzJzz2WOZu+soUyVxQ7QksDSEsCyGsAaYDh1e9IITwTgjhFSALZkUlG7Vp4/XvS5b4CDPTzJ3rCTEbyh83Zdw4WL58/UET6XD22X6264QJSuzJlEhy7wAsr3K7rOI+kbTq2xfOOcenDdKZfBJRUuI90o86KnYkDdO/v38CSdempiee8J5Cv/2tnzsqyZNIcq/pvbRe/9nNbISZlZpZaXl5eX2eQvLcJZfA3nt7c7Hly2u/Ph3KymD2bBg6NPt7oBQUeOuHF17wOvNU+uYb34m63XbZuVkt0yWS3MuATlVudwRW1OfFQgiTQwhFIYSitm3b1ucpJM81buwjvTVr4KST4p4uX+mmm3yUO2JE7EiS49RTYcstU39S05VX+mEhEyZ42askVyLJfT7Qzcy6mlkTYCAwK7VhiWzcDjt4QnjqKbjssrixfPutJ/d+/XKn33jz5v5Gdf/98PbbqXmNZcu8A+gxx2TP7uNsU2tyDyGsBUYDc4DFwIwQwiIzG29mAwDMbG8zKwOOBSaZ2aJUBi1y8snemOuii1I/fbAps2fD++9n/0JqdaNH+xRNKs76DMGfv1EjP1dUUkO9ZSRrffYZ7Lmn/7xwYZz66AMP9Aqet9/Ove3ygwf7m1dZmZegJsu99/qI/eqrM7PyKdOpt4zkvNatff59+XJfmEv3OOXNN+Hxx30KI9cSO3ji/eILuOWW5D3nF1/4gu0ee3hjOEkdJXfJavvs41Mzd93lrXbTadIkn1oYOjS9r5suRUXws59524e1a5PznH/4A7z3npeOpmuTVL5Scpesd/75noRGjUpfV8P//c/bDRxxhPdlyVVjx8I778ADDzT8uV55Ba67zstY99mn4c8nm6bkLlmvsNBH7Y0awQkneAVLqs2cCR9/nHsLqdUNGOB16Ndc07DnWbfOe+5suWX8Cqd8oeQuOWHbbb0kcf58uPDC1L9eSYnvqMz18z0LC32O/F//guefr//zTJniVU1XXglbbZW8+GTjlNwlZxxzDAwb5gcrz5uXutd5+WVPVCNH5kcvlCFDvFqmvqP3
Vau8vcDPfuYHr0h6KLlLTrn2Wh9Rn3SSJ5VUmDgRNt88fxJVy5ZeETRzJrz7bt3//Tnn+IErJSX58WaYKZTcJac0b+6nN61a5aP4ZJdHfvGFz+8ff3x+TS9Uli3WdVPTM8/4wvO4cbDLLsmPSzZOyV1yTo8evmj34IM+yk6mO++E1atzfyG1um23haOP9nWN1asT+zfffuu/p223Tc86iGxIyV1y0pgx8Ktf+YhxUZKaYYTgUwt77ukHS+SbceN8V/CttyZ2/bXXwmuveZ18ph2ukg+U3CUnFRTAtGm+EDhokB8G0VDPPee12qefnp9zx716eX36tdfW3o3z3Xd9w9Jhh8Hhh2/6WkkNJXfJWe3a+Sjz1Vd9Ua+hSkp8cfGEExr+XNlq7Fjv6Dh79qavGzPGP+lk+qHmuUzJXXLaIYd4ornhBnj44fo/z0cfwYwZXoXTokXy4ss2Rx4JnTtvutf7Qw/5jtYLL4QuXdIWmlSj5C457/LLvVHVqad6e976mDrVTw4aOTKZkWWfRo3gzDPhn/+EBQt++PhXX3llTffuPkcv8Si5S87bbDMvj/zyS69NX1fHY9zXrfMmYfvtB7vtlpoYs8nQoT49VdOmpksv9V40N96Y/UcOZjsld8kLO+/sC4Fz59b9+Lh587y9b76VP25M69ae4O+5xzs8Vlq8GK66yg9S+fnP48UnTsld8sbw4XDUUd5FsqYphY0pKYE2bby9gbgzz/RPNH/9q98Owbtytmjh/WMkPiV3yRtmvgnnRz/y8shENuOsWOGboYYM8ekdcV27+uLqpEk+3XXHHfCPf8Cf/+y/X4lPyV3yylZbeSJautSraGpz881e033aaamPLduMHQuffOLTXWef7XXww4fHjkoqKblL3vnFL3xqZsoUL2/cmLVrfaR/0EGw/fZpCy9r7Luv79S94AIvFS0p8c1jkhn0n0Ly0kUX+UhzxAj4739rvubhh/1waC2k1sxs/QHXZ5zhPX0kc+gUQ8lLjRv7uat77gmDB/t8cfUzPUtKoEMHOPTQKCFmheOO8yQ/YEDsSKQ6jdwlb223nSfwZ5+FP/5xw8eWLYM5c3wOWQc5b1xBgbc/bto0diRSnZK75LXBg72lwPjx3nu80qRJfsSpDaehAAAGK0lEQVTcsGHxYhNpCCV3yXt//av3QBk8GD791NsMTJniUw0dOsSOTqR+lNwl77Vq5e0JVqzwksd77/WTnLSQKtksoeRuZv3MbImZLTWzc2t4fDMzu6fi8efNrEuyAxVJpZ49fWpmxgzffbn99tC3b+yoROqv1uRuZoXABOBgoDswyMy6V7tsKPBJCGEH4Brg8mQHKpJq55zjNfAffeTdH1WzLdkskT/fnsDSEMKyEMIaYDpQ/WyVw4FpFT/PBPqa5eNZNZLNCgu9PPL8873+XSSbJZLcOwDLq9wuq7ivxmtCCGuBz4CtkxGgSDq1b+9lka1axY5EpGESSe41jcBDPa7BzEaYWamZlZaXlycSn4iI1EMiyb0M6FTldkdgxcauMbNGQGvg4+pPFEKYHEIoCiEUtW3btn4Ri4hIrRJJ7vOBbmbW1cyaAAOBWdWumQWcUvHzMcC8EMIPRu4iIpIetW6sDiGsNbPRwBygEJgSQlhkZuOB0hDCLOAW4HYzW4qP2AemMmgREdm0hLpmhBD+Dvy92n0XVvn5a+DY5IYmIiL1pUpeEZEcpOQuIpKDlNxFRHKQxSpqMbNyYCNn4NSqDbAqieEki+KqG8VVd5kam+Kqm4bE1TmEUGstebTk3hBmVhpCKIodR3WKq24UV91lamyKq27SEZemZUREcpCSu4hIDsrW5D45dgAbobjqRnHVXabGprjqJuVxZeWcu4iIbFq2jtxFRGQTsiq5m1knM3vSzBab2SIzGxM7JgAz29zMXjCzlyviujh2TFWZWaGZvWRmD8WOpZKZvWNmr5rZQjMrjR1PJTPbwsxmmtkbFX9n+2RATDtW/J4qvz43s7Ni
xwVgZmMr/uZfM7O7zWzz2DEBmNmYipgWxfxdmdkUM/vQzF6rct9WZjbXzN6s+L5lKl47q5I7sBY4O4SwM9AbKK7hyL8YvgEOCCHsAewJ9DOz3pFjqmoMsDh2EDXYP4SwZ4aVql0HPBpC2AnYgwz4vYUQllT8nvYEfgp8BdwfOSzMrANwJlAUQtgVbywYvWmgme0KDMdPkdsDONTMukUKZyrQr9p95wJPhBC6AU9U3E66rEruIYT3QwgvVvz8Bf5/vOqnQqVdcKsrbjau+MqIxQwz6wj0B26OHUumM7NWQB+8yykhhDUhhE/jRvUDfYG3Qgj13QCYbI2AphXnODTjh2c9xLAz8FwI4auKk+GeAo6MEUgI4Wl+eLZF1WNJpwFHpOK1syq5V2VmXYAewPNxI3EVUx8LgQ+BuSGEjIgLuBY4B1gXO5BqAvCYmS0ws0w5sXQ7oBy4tWIa62Yzax47qGoGAnfHDgIghPAecBXwLvA+8FkI4bG4UQHwGtDHzLY2s2bAIWx44FBs7UII74MPWIEfpeJFsjK5m1kL4F7grBDC57HjAQghfFfxsbkj0LPio2FUZnYo8GEIYUHsWGqwXwhhL+BgfHqtT+yA8FHoXkBJCKEH8CUp+shcHxWH5QwA/hY7FoCKueLDga7ANkBzMzsxblQQQlgMXA7MBR4FXsandPNK1iV3M2uMJ/Y7Qwj3xY6nuoqP8f/gh/NsMewHDDCzd4DpwAFmdkfckFwIYUXF9w/x+eOecSMC/LjIsiqfumbiyT5THAy8GEJYGTuQCr8E3g4hlIcQvgXuA/aNHBMAIYRbQgh7hRD64NMib8aOqYqVZtYeoOL7h6l4kaxK7mZm+Hzo4hDC1bHjqWRmbc1si4qfm+J/9G/EjQpCCOeFEDqGELrgH+fnhRCij6zMrLmZtaz8GTgI/ygdVQjhA2C5me1YcVdf4PWIIVU3iAyZkqnwLtDbzJpV/H+zLxmwAA1gZj+q+L4tcBSZ9XureizpKcCDqXiRhE5iyiD7AScBr1bMbwOcX3FSVEztgWlmVoi/Yc4IIWRM2WEGagfc7/mARsBdIYRH44b0vTOAOyumQJYBQyLHA0DF3PGBwGmxY6kUQnjezGYCL+LTHi+ROTtC7zWzrYFvgeIQwicxgjCzu4FfAG3MrAy4CLgMmGFmQ/E3yJScYqcdqiIiOSirpmVERCQxSu4iIjlIyV1EJAcpuYuI5CAldxGRHKTkLiKSg5TcRURykJK7iEgO+v+C7GWmuRUxNgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f60479c31d0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot CH_scores against Ks; label the axes so the figure stands on its own.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')\n",
    "plt.xlabel('K')\n",
    "plt.ylabel('CH score')\n",
    "# Trailing ';' suppresses the text repr of the last expression in the cell output.\n",
    "plt.title('CH score vs. K');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 这么看来，K=2 就是最优的了。这样想来，不用再做一次，直接取最小值 K=2 就够了，其实没有必要再画一次……不过这样也好，更保险。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
